# Load the raw data file, dropping columns 2-9 and 11-13 by position.
data <- read.csv("../data/filledDatabase.csv")[, -c(2:9, 11:13)]
# Clean, then collapse the data (both helpers are defined elsewhere).
data <- clean_data(data) %>% collapse_data()
# Keep the compound, group category and space group labels before the
# label columns are removed from the modelling data.
compound <- data$Compound
group_cat <- data$GroupCat
space_group <- data$SpaceGroup
# Drop the columns that are not used for modelling; GroupCat stays in `data`
# because it is the response of the models fitted below.
data <- select(
  data,
  -c("Compound", "X", "Z", "SpaceGroup", "SpaceGroupNumber")
)
# Disabled alternative: represent the predictors by their first 13 PCs.
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()
# Split the row indices into 5 held-out folds for cross validation later.
# seq_len() is used instead of 1:nrow(data) so that an empty data frame gives
# an empty index vector rather than c(1, 0).
# NOTE(review): no set.seed() is visible in this section, so the fold
# assignment presumably changes from run to run - confirm.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)
Multinomial Regression
library(glmnet)
# Predictor matrix: every column except the response. Selecting by name
# (instead of dropping column 1) keeps GroupCat out of X wherever it sits in
# the data frame, matching the `GroupCat ~ .` formula used for the elastic
# net. drop = FALSE keeps a data frame even if one predictor remains.
X <- as.matrix(data[, setdiff(names(data), "GroupCat"), drop = FALSE])
# Response as a one-column matrix of group categories.
Y <- as.matrix(data$GroupCat)
Coefficient
Ridge
# Ridge penalty (alpha = 0) for the multinomial model; lambda is tuned by
# 5-fold cross validation on the deviance.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
# Coefficients at lambda.min for the three classes, plotted without the
# intercept row (get_coef() and plot_coef() are defined elsewhere).
get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

LASSO
# Lasso penalty (alpha = 1) for the multinomial model; lambda is tuned by
# 5-fold cross validation on the deviance.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
# Coefficients at lambda.min for the three classes, plotted without the
# intercept row (get_coef() and plot_coef() are defined elsewhere).
get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Elastic Net
library(caret)
# Elastic net through caret's "glmnet" method: 5-fold cross validation with
# tuneLength = 10.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 5)
)
# Coefficients of the final model at the selected lambda, plotted without
# the intercept row (get_coef() and plot_coef() are defined elsewhere).
get_coef(
  elastic_cv$finalModel,
  tuning_parameter = elastic_cv$bestTune$lambda
) %>%
  select(feature, Cubic, Tilted, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Accurate classification rate
Ridge
# Prediction summary for the ridge model (alpha = 0) at its lambda.min;
# prediction_table() is defined elsewhere.
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)
# Show the `r` component with the accuracy-table helper.
print_accurate_tb(tb_ridge$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.9466667 | 0.92 | 0.890411 | 0.9594595 | 0.8918919 | 0.9216858 |
# Show the ridge cross-tabulation `t` as counts (helper defined elsewhere).
highlight_tb_count(tb_ridge$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 171 | 3 | 11 |
| Others | 2 | 29 | 2 |
| Tilted | 6 | 5 | 142 |
| Total | 179 | 37 | 155 |
# Show the ridge cross-tabulation `t` as proportions (helper defined
# elsewhere).
highlight_tb_percent(tb_ridge$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.96 | 0.08 | 0.07 |
| Others | 0.01 | 0.78 | 0.01 |
| Tilted | 0.03 | 0.14 | 0.92 |
| Total | 100% | 100% | 100% |
# List every cell of the ridge cross-tabulation in long form, most frequent
# combination first.
arrange(as.data.frame(tb_ridge$t), desc(Freq))
## Var1 Var2 Freq
## 1 Cubic Cubic 171
## 2 Tilted Tilted 142
## 3 Others Others 29
## 4 Cubic Tilted 11
## 5 Tilted Cubic 6
## 6 Tilted Others 5
## 7 Cubic Others 3
## 8 Others Cubic 2
## 9 Others Tilted 2
LASSO
# Prediction summary for the lasso model (alpha = 1) at its lambda.min;
# prediction_table() is defined elsewhere.
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)
# Show the `r` component with the accuracy-table helper.
print_accurate_tb(tb_lasso$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.9466667 | 0.9466667 | 0.9315068 | 0.9324324 | 0.972973 | 0.9460491 |
# Show the lasso cross-tabulation `t` as counts (helper defined elsewhere).
highlight_tb_count(tb_lasso$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 177 | 3 | 7 |
| Others | 1 | 29 | 3 |
| Tilted | 1 | 5 | 145 |
| Total | 179 | 37 | 155 |
# Show the lasso cross-tabulation `t` as proportions (helper defined
# elsewhere).
highlight_tb_percent(tb_lasso$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.99 | 0.08 | 0.05 |
| Others | 0.01 | 0.78 | 0.02 |
| Tilted | 0.01 | 0.14 | 0.94 |
| Total | 100% | 100% | 100% |
Elastic Net
# Prediction summary for the elastic net at the tuning parameters caret
# selected; prediction_table() is defined elsewhere. bestTune is read by
# column name rather than by position, so the call does not depend on the
# column order and matches the `bestTune$lambda` access used for the
# coefficient plot.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
# Show the `r` component with the accuracy-table helper.
print_accurate_tb(tb_elastic$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.9466667 | 0.96 | 0.9589041 | 0.9324324 | 0.9594595 | 0.9514925 |
# Show the elastic net cross-tabulation `t` as counts (helper defined
# elsewhere).
highlight_tb_count(tb_elastic$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 176 | 2 | 3 |
| Others | 2 | 29 | 4 |
| Tilted | 1 | 6 | 148 |
| Total | 179 | 37 | 155 |
# Show the elastic net cross-tabulation `t` as proportions (helper defined
# elsewhere).
highlight_tb_percent(tb_elastic$t)
|  | Cubic | Others | Tilted |
|---|---|---|---|
| Cubic | 0.98 | 0.05 | 0.02 |
| Others | 0.01 | 0.78 | 0.03 |
| Tilted | 0.01 | 0.16 | 0.95 |
| Total | 100% | 100% | 100% |